library(readr)
Untreated <- readRDS(paste0(wd,"/data/NCI_TPW_gep_untreated.rds"))
Treated <- readRDS(paste0(wd,"/data/NCI_TPW_gep_treated.rds"))
Metadata = read.table(paste0(wd,"/data/NCI_TPW_metadata.tsv"), header = TRUE, sep ="\t", stringsAsFactors = TRUE)
Cellline_annotation = read.table(paste0(wd,"/data/cellline_annotation.tsv"), header = TRUE, sep ="\t", stringsAsFactors = TRUE)
Drug_annotation = read.table(paste0(wd,"/data/drug_annotation.tsv"), header = TRUE, sep ="\t", stringsAsFactors = TRUE)
Treated <- as.data.frame(Treated)
Untreated <- as.data.frame(Untreated)
Biomarkers over “normal” fold change, without absolute values:
### creat FC data
FC <- TreatedVorinostat - UntreatedVorinostat
# work with mean of the rows because we only want to compare the genes
FC_meanrow= rowMeans(FC)
#################################################################################################################
### sort the data
# work with absolute value to find the highest values
# because we want to have the most up and down regulated genes
FC_abs= abs(FC_meanrow)
## sort the values to get the 100 largest values
sortedFC_abs <- sort(FC_abs, decreasing = TRUE)
sortedFC_abs <- as.matrix(sortedFC_abs)
# take the first 100 for biomarkers
biomarkers_FC = sortedFC_abs[1:100,]
biomarkers_FC <- as.matrix(biomarkers_FC)
# see that the last ones have very similar values
# creat vector with gene names
biomarkers_FC_genes= row.names(biomarkers_FC)
#################################################################################################################
### creat matrix with FC values (positive and negativ)
# add the abs values to FC matrix
FC_both= cbind(FC_meanrow,FC_abs)
FC_both=as.data.frame(FC_both)
# order this matrix
FC_both_sorted <- FC_both[order(FC_both$FC_abs, decreasing = TRUE),]
# FC values of biomarkers
# take the first 100 of the sorted matrix, should be the same as un biomarkers_FC
biomarkers_FC_values = FC_both_sorted[1:100,]
# remove the absolute values
biomarkers_FC_values <- subset( biomarkers_FC_values, select = -FC_abs)
biomarkers_FC_values = as.matrix(biomarkers_FC_values)
Gene Ontology analysis, more plots and information, but inconclusive
################################# Gene Ontology analysis with FC biomarkers ##########################################
# load biomarkers over FC
# matrix we will work with:
biomarkers_FC_genes= row.names(biomarkers_FC)
biomarkers_FC_values = as.matrix(biomarkers_FC_values)
# load the library
library(clusterProfiler)
##
## clusterProfiler v3.10.1 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#################################################################################################################
### translate the gene names
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:BBmisc':
##
## normalize
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, basename, cbind, colMeans, colSums, colnames,
## dirname, do.call, duplicated, eval, evalq, get, grep, grepl,
## intersect, is.unsorted, lapply, lengths, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind,
## rowMeans, rowSums, rownames, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:BBmisc':
##
## collapse
##
# do the translation
# index 2 because already done analysis with biomarkers from p.values
translated.genes2= bitr(biomarkers_FC_genes,fromType="SYMBOL", toType="ENTREZID",OrgDb = org.Hs.eg.db)
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(biomarkers_FC_genes, fromType = "SYMBOL", toType =
## "ENTREZID", : 10% of input gene IDs are fail to map...
head(translated.genes2)
## SYMBOL ENTREZID
## 1 DHRS2 10202
## 2 ABAT 18
## 3 SERPINI1 5274
## 6 CLU 1191
## 7 NMI 9111
## 8 STC1 6781
#################################################################################################################
#### Gene Ontology Classification
# Notes: BP for Biological Process, MF for Molecular Function, and CC for Cellular Component
library(DOSE)
## Warning: package 'DOSE' was built under R version 3.5.2
## DOSE v3.8.2 For help: https://guangchuangyu.github.io/DOSE
##
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
# take the needed gene ID
gene2=translated.genes2$ENTREZID
head(gene2)
## [1] "10202" "18" "5274" "1191" "9111" "6781"
### do classification
## 1. Biological Process
ggo2.1 <- groupGO(gene= gene2, OrgDb = org.Hs.eg.db, ont = "BP", level = 3, readable = FALSE)
head(summary(ggo2.1))
## Warning in summary(ggo2.1): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID Description Count
## GO:0019953 GO:0019953 sexual reproduction 4
## GO:0019954 GO:0019954 asexual reproduction 0
## GO:0022414 GO:0022414 reproductive process 8
## GO:0032504 GO:0032504 multicellular organism reproduction 4
## GO:0032505 GO:0032505 reproduction of a single-celled organism 0
## GO:0061887 GO:0061887 reproduction of symbiont in host 0
## GeneRatio geneID
## GO:0019953 4/90 18/8061/330/8900
## GO:0019954 0/90
## GO:0022414 8/90 18/6781/374/2353/10370/8061/330/8900
## GO:0032504 4/90 6781/8061/330/8900
## GO:0032505 0/90
## GO:0061887 0/90
# visualization
barplot(ggo2.1, drop=TRUE, showCategory=12)
## 2. Molecular Function
ggo2.2 <- groupGO(gene= gene2, OrgDb = org.Hs.eg.db, ont = "MF", level = 3, readable = FALSE)
head(summary(ggo2.2))
## Warning in summary(ggo2.2): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID
## GO:0001070 GO:0001070
## GO:0001072 GO:0001072
## GO:0001073 GO:0001073
## GO:0001134 GO:0001134
## GO:0003700 GO:0003700
## GO:0003711 GO:0003711
## Description
## GO:0001070 RNA-binding transcription regulator activity
## GO:0001072 transcription antitermination factor activity, RNA binding
## GO:0001073 transcription antitermination factor activity, DNA binding
## GO:0001134 transcription regulator recruiting activity
## GO:0003700 DNA-binding transcription factor activity
## GO:0003711 transcription elongation regulator activity
## Count GeneRatio geneID
## GO:0001070 0 0/90
## GO:0001072 0 0/90
## GO:0001073 0 0/90
## GO:0001134 0 0/90
## GO:0003700 9 9/90 8091/4790/2353/10370/8061/2305/2969/23635/467
## GO:0003711 0 0/90
# visualization
barplot(ggo2.2, drop=TRUE, showCategory=12)
## 3. Cellular Component
ggo2.3 <- groupGO(gene= gene2, OrgDb = org.Hs.eg.db, ont = "CC", level = 3, readable = FALSE)
head(summary(ggo2.3))
## Warning in summary(ggo2.3): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID Description Count GeneRatio
## GO:0005886 GO:0005886 plasma membrane 25 25/90
## GO:0005628 GO:0005628 prospore membrane 0 0/90
## GO:0005789 GO:0005789 endoplasmic reticulum membrane 4 4/90
## GO:0019867 GO:0019867 outer membrane 1 1/90
## GO:0031090 GO:0031090 organelle membrane 17 17/90
## GO:0034357 GO:0034357 photosynthetic membrane 0 0/90
## geneID
## GO:0005886 6781/57030/8744/55256/4758/79850/360/5168/23208/7205/11151/80328/8061/4900/1490/10486/8829/55915/6515/1368/27340/6253/27131/4804/306
## GO:0005628
## GO:0005789 374/9526/6253/2281
## GO:0019867 637
## GO:0031090 1191/374/7298/57030/4758/4001/23208/51131/11151/4900/57134/6515/64921/27131/637/2281/306
## GO:0034357
# visualization
barplot(ggo2.3, drop=TRUE, showCategory=12)
#################################################################################################################
### enrich GO
# only works with gene id ENSEMBL
library(org.Hs.eg.db)
# create gene data frame ans translate it
gene2.2 <- row.names(biomarkers_FC_values)
gene.df2 <- bitr(gene2.2, fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(gene2.2, fromType = "SYMBOL", toType = c("ENSEMBL",
## "ENTREZID"), : 10% of input gene IDs are fail to map...
##############################################################################################################
### Cellular Component
ego2.1 <- enrichGO(gene = gene.df2$ENSEMBL,
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego2.1))
## Warning in summary(ego2.1): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID Description GeneRatio BgRatio
## GO:0035580 GO:0035580 specific granule lumen 9/98 83/21794
## GO:0042581 GO:0042581 specific granule 11/98 213/21794
## GO:0043202 GO:0043202 lysosomal lumen 8/98 111/21794
## GO:0034774 GO:0034774 secretory granule lumen 12/98 374/21794
## GO:0060205 GO:0060205 cytoplasmic vesicle lumen 12/98 392/21794
## GO:0031983 GO:0031983 vesicle lumen 12/98 393/21794
## pvalue p.adjust qvalue
## GO:0035580 1.290481e-10 2.864867e-08 2.445121e-08
## GO:0042581 3.196320e-09 3.547915e-07 3.028092e-07
## GO:0043202 3.773459e-08 2.792360e-06 2.383237e-06
## GO:0034774 1.180312e-07 6.550730e-06 5.590950e-06
## GO:0060205 1.958048e-07 7.445533e-06 6.354652e-06
## GO:0031983 2.012306e-07 7.445533e-06 6.354652e-06
## geneID
## GO:0035580 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000109320
## GO:0042581 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000109320/ENSG00000059804/ENSG00000138772
## GO:0043202 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0034774 ENSG00000163536/ENSG00000120885/ENSG00000103196/ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000109320
## GO:0060205 ENSG00000163536/ENSG00000120885/ENSG00000103196/ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000109320
## GO:0031983 ENSG00000163536/ENSG00000120885/ENSG00000103196/ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000109320
## Count
## GO:0035580 9
## GO:0042581 11
## GO:0043202 8
## GO:0034774 12
## GO:0060205 12
## GO:0031983 12
# visualization
dotplot(ego2.1, showCategory=3)
cnetplot(ego2.1)
emapplot(ego2.1)
##############################################################################################################
### Biological Process
ego2.2 <- enrichGO(gene = gene.df2$ENSEMBL,
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego2.2))
## Warning in summary(ego2.2): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID Description GeneRatio
## GO:0006689 GO:0006689 ganglioside catabolic process 8/95
## GO:0009313 GO:0009313 oligosaccharide catabolic process 8/95
## GO:0046479 GO:0046479 glycosphingolipid catabolic process 8/95
## GO:0019377 GO:0019377 glycolipid catabolic process 8/95
## GO:0046514 GO:0046514 ceramide catabolic process 8/95
## GO:0001573 GO:0001573 ganglioside metabolic process 8/95
## BgRatio pvalue p.adjust qvalue
## GO:0006689 14/20505 4.609982e-16 9.385924e-13 8.477514e-13
## GO:0009313 22/20505 4.762652e-14 3.232253e-11 2.919422e-11
## GO:0046479 22/20505 4.762652e-14 3.232253e-11 2.919422e-11
## GO:0019377 24/20505 1.087160e-13 5.533646e-11 4.998076e-11
## GO:0046514 25/20505 1.592733e-13 6.485610e-11 5.857906e-11
## GO:0001573 30/20505 8.457689e-13 2.869976e-10 2.592208e-10
## geneID
## GO:0006689 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0009313 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0046479 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0019377 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0046514 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0001573 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## Count
## GO:0006689 8
## GO:0009313 8
## GO:0046479 8
## GO:0019377 8
## GO:0046514 8
## GO:0001573 8
# visualization
dotplot(ego2.2, showCategory=10)
cnetplot(ego2.2)
emapplot(ego2.2)
##############################################################################################################
### Molecular Function
ego2.3 <- enrichGO(gene = gene.df2$ENSEMBL,
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego2.3))
## Warning in summary(ego2.3): summary method to convert the object to
## data.frame is deprecated, please use as.data.frame instead.
## ID Description
## GO:0052794 GO:0052794 exo-alpha-(2->3)-sialidase activity
## GO:0052795 GO:0052795 exo-alpha-(2->6)-sialidase activity
## GO:0052796 GO:0052796 exo-alpha-(2->8)-sialidase activity
## GO:0004308 GO:0004308 exo-alpha-sialidase activity
## GO:0016997 GO:0016997 alpha-sialidase activity
## GO:0004553 GO:0004553 hydrolase activity, hydrolyzing O-glycosyl compounds
## GeneRatio BgRatio pvalue p.adjust qvalue
## GO:0052794 8/96 12/19626 1.184943e-16 1.228391e-14 1.130893e-14
## GO:0052795 8/96 12/19626 1.184943e-16 1.228391e-14 1.130893e-14
## GO:0052796 8/96 12/19626 1.184943e-16 1.228391e-14 1.130893e-14
## GO:0004308 8/96 14/19626 7.131444e-16 4.435758e-14 4.083690e-14
## GO:0016997 8/96 14/19626 7.131444e-16 4.435758e-14 4.083690e-14
## GO:0004553 9/96 105/19626 2.233203e-09 1.157544e-07 1.065669e-07
## geneID
## GO:0052794 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0052795 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0052796 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0004308 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0016997 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957
## GO:0004553 ENSG00000204386/ENSG00000227315/ENSG00000234343/ENSG00000234846/ENSG00000227129/ENSG00000184494/ENSG00000228691/ENSG00000223957/ENSG00000117643
## Count
## GO:0052794 8
## GO:0052795 8
## GO:0052796 8
## GO:0004308 8
## GO:0016997 8
## GO:0004553 9
# visualization
dotplot(ego2.3, showCategory=10)
cnetplot(ego2.3)
emapplot(ego2.3)
KEGG enrichment, better then GO, fits with literature to vorinostat
library(KEGG.db)
##
## KEGG.db contains mappings based on older data because the original
## resource was removed from the the public domain before the most
## recent update was produced. This package should now be
## considered deprecated and future versions of Bioconductor may
## not have it available. Users who want more current data are
## encouraged to look at the KEGGREST or reactome.db packages
# see for other applications of this libary:
# https://bioconductor.org/packages/release/data/annotation/manuals/KEGG.db/man/KEGG.db.pdf
###################################################################################################
# works better with biomarkers from FC
library(org.Hs.eg.db)
gene2 <- row.names(biomarkers_FC)
gene.df <- bitr(gene2, fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(gene2, fromType = "SYMBOL", toType = c("ENSEMBL",
## "ENTREZID"), : 10% of input gene IDs are fail to map...
kk <- enrichKEGG(gene=gene.df$ENTREZID,pvalueCutoff = 0.05)
head(summary(kk))
## Warning in summary(kk): summary method to convert the object to data.frame
## is deprecated, please use as.data.frame instead.
## ID Description GeneRatio
## hsa05202 hsa05202 Transcriptional misregulation in cancer 7/46
## hsa04215 hsa04215 Apoptosis - multiple species 3/46
## hsa05203 hsa05203 Viral carcinogenesis 6/46
## hsa04210 hsa04210 Apoptosis 5/46
## hsa05166 hsa05166 Human T-cell leukemia virus 1 infection 6/46
## BgRatio pvalue p.adjust qvalue
## hsa05202 186/7866 9.055915e-05 0.01331219 0.01162970
## hsa04215 33/7866 9.032146e-04 0.04092195 0.03574993
## hsa05203 201/7866 1.032008e-03 0.04092195 0.03574993
## hsa04210 136/7866 1.113522e-03 0.04092195 0.03574993
## hsa05166 219/7866 1.605648e-03 0.04720605 0.04123980
## geneID Count
## hsa05202 8091/4790/330/4291/4804/8900/1026 7
## hsa04215 330/637/4804 3
## hsa05203 3017/4790/8349/8365/8900/1026 6
## hsa04210 4001/4790/2353/330/637 5
## hsa05166 4790/2353/8061/8829/8900/1026 6
# visualization
barplot(kk,showCategory=12)
dotplot(kk, showCategory=12)
cnetplot(kk,categorySize="geneNum")
emapplot(kk)
# problem: only finds 5 categories